Predicting the Existence of Exoplanets

¶

Sean Hughes

¶

exoplanet animation

Stellar Systems in blue, Confirmed Exoplanetary systems in red


Introduction

¶

In [8]:
#imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

#jupyter nbconvert --to html model.ipynb
In [ ]:
#data processing
#csv to pandas
exoDf = pd.read_csv('ExoplanetData.csv')

#remove HD from HD name column, redundant, also remove A
exoDf['hd_name'] = exoDf['hd_name'].str.extract('(\d+)', expand=False)
exoDf['hd_name'] = pd.to_numeric(exoDf['hd_name'], errors='coerce').fillna(exoDf['hd_name'])
exoDf['hd_name'] = exoDf['hd_name'].astype('Int64')
In [ ]:
#csv
hdDf = pd.read_csv('HDCatalogue.csv', sep=';')
In [5]:
#currently there are 121 different columns, lets reduce this to some usable ones.
#lets keep time discovered, name, plus anything that can be applied for regression, 
#leaving out flags, discovery methods, etc. 

#lets make a model that predicts based on our exoDf, the number of planets

#only keep necessary columns

#can make 3d plot of all observed exoplanets

columns_to_keep = ['pl_name', 'hostname', 'hd_name', 'hip_name', 'tic_id',
            'gaia_id', 'sy_snum', 'sy_pnum', 'sy_mnum',
            'st_spectype', 'st_teff', 'st_rad', 'st_mass',
            'st_logg', 'st_age', 'st_dens', 'ra', 'dec', 'sy_dist',]

exoDf = exoDf[columns_to_keep]

#first, we have very small amount of missing data, lets mean impute our mumeric columns.
columns_to_impute = ['st_age', 'st_mass', 'st_dens', 'st_rad', 'st_teff', 'st_logg', 'sy_dist']
for column in columns_to_impute:
    exoDf[column] = exoDf[column].fillna(exoDf[column].mean())
In [6]:
#make a 3d plot of all the stars? test this out

#converts degree in range 0-360 to a radian in range -pi to pi
def deg_from_neg_pi_to_pi(deg):
    return (deg * np.pi / 180) - np.pi

#converts a degree in range -90 to 90 to a radian in range -pi/2 to pi/2
def deg_to_rad_90(deg):
    return (deg * np.pi / 180)

raHd = hdDf['_RAJ2000'].apply(deg_from_neg_pi_to_pi)
raExo = exoDf['ra'].apply(deg_from_neg_pi_to_pi)

decHd = hdDf['_DEJ2000'].apply(deg_to_rad_90)
decExo = exoDf['dec'].apply(deg_to_rad_90)

# Set a higher DPI for better resolution
fig = plt.figure(figsize=(15, 10), dpi=500)
ax = plt.subplot(111, projection='aitoff')

# Initialize the plot with initial data
sc_hd = ax.scatter(raHd, decHd, s=1, c='blue', alpha=0.1)
sc_exo = ax.scatter(raExo, decExo, s=1, c='red', alpha=0.3)
ax.set_xlabel('Right Ascension')
ax.set_ylabel('Declination')
ax.set_title('Systems with Confirmed Exoplanets')
ax.grid(True)
ax.set_facecolor('white')

# Update function for animation
def update(frame):
    global sc_hd, sc_exo, sc_zero_line
    # Shift in radians
    shift_radians = np.deg2rad(frame) % (2 * np.pi)
    
    # Update the RA for HD stars
    new_ra_hd = ((raHd + shift_radians + np.pi) % (2 * np.pi)) - np.pi
    # Update the RA for exoplanets
    new_ra_exo = ((raExo + shift_radians + np.pi) % (2 * np.pi)) - np.pi
    
    # Update the scatter plot data
    sc_hd.set_offsets(np.column_stack((new_ra_hd, decHd)))
    sc_exo.set_offsets(np.column_stack((new_ra_exo, decExo)))

    return sc_hd, sc_exo

#uncomment if you want to make the animation!
#ani = animation.FuncAnimation(fig, update, frames=np.arange(0, 360, 3), interval = 100, blit=True, repeat=True)
#ani.save('stars_animation.gif', writer='ffmpeg', dpi=200)

plt.show()
No description has been provided for this image
In [7]:
#And the animation looks like this!

exoplanet animation

In [7]:
#lets make anothe graph of discovered exoplanets, with earth at the center



#cartesian
x = exoDf['sy_dist'] * np.cos(np.deg2rad(exoDf['dec'])) * np.cos(np.deg2rad(exoDf['ra']))
y = exoDf['sy_dist'] * np.cos(np.deg2rad(exoDf['dec'])) * np.sin(np.deg2rad(exoDf['ra']))
z = exoDf['sy_dist'] * np.sin(np.deg2rad(exoDf['dec']))


# Create a 3D scatter plot
fig = plt.figure(figsize=(12, 8), dpi=200)
ax = fig.add_subplot(111, projection='3d')

#plot
scatter = ax.scatter(x, y, z, c='red', marker='o', alpha=0.1)

# Set labels and title
ax.set_title('Cartesian plot of Confirmed Exoplanet Systems')

ax.set_xlim([-1500, 1500])
ax.set_ylim([-8000, 8000])
ax.set_zlim([-5000, 5000])

ax.set_xticklabels([])
ax.set_yticklabels([])
ax.set_zticklabels([])


def update(frame):
    ax.view_init(elev=10, azim=frame)  # Rotate the view
    return scatter,

#uncomment if you want to make the animation!
#ani = animation.FuncAnimation(fig, update, frames=np.arange(0, 360, 2), interval=100)
#ani.save('circular_3d_plot.gif', writer='pillow', dpi=200, savefig_kwargs={"transparent": True})

plt.show()
No description has been provided for this image
In [ ]:
#And the animation looks like this!

cartesian animation

In [ ]:
#lets make the model for the regression, visualize what we are working with
#relate spectral type and num of exoplanets

#extract just the spectral type symbol
exoDf['st_spectype'] = exoDf['st_spectype'].str[:2]

#group by spectype and find the mean of planets of that spectype
avg_exoplanets = exoDf.groupby('st_spectype')['sy_pnum'].mean()

#sort highest to lowest
avg_exoplanets_sorted = avg_exoplanets.sort_values(ascending=False)

plt.figure(figsize=(15, 10))
avg_exoplanets_sorted.plot(kind='bar', color='green')
plt.xlabel('Spectral Type')
plt.ylabel('Average Number of Exoplanets')
plt.title('Average Number of Exoplanets by Spectral Type')
plt.xticks(rotation=45)
plt.grid(axis='y')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
#relate age with num of exoplanets

#mean num of exoplanets for each age
mean_exoplanets_by_age = exoDf.groupby('st_age')['sy_pnum'].mean()

# Filter the DataFrame to include only observations with stellar mass between 0 and 5 
#solar masses, this eliminates outliers in our data
filtered_exoDf = exoDf[(exoDf['st_mass'] >= 0) & (exoDf['st_mass'] <= 5)]
mean_exoplanets_by_mass = filtered_exoDf.groupby('st_mass')['sy_pnum'].mean()

filtered_exoDf = exoDf[(exoDf['st_dens'] >= 0) & (exoDf['st_dens'] <= 150)]
mean_exoplanets_by_density = filtered_exoDf.groupby('st_dens')['sy_pnum'].mean()

mean_exoplanets_by_radius = exoDf.groupby('st_rad')['sy_pnum'].mean()

filtered_exoDf = exoDf[(exoDf['st_teff'] >= 0) & (exoDf['st_teff'] <= 10000)]
mean_exoplanets_by_teff = filtered_exoDf.groupby('st_teff')['sy_pnum'].mean()

filtered_exoDf = exoDf[(exoDf['st_logg'] >= -2) & (exoDf['st_logg'] <= 6)]
mean_exoplanets_by_logg = filtered_exoDf.groupby('st_logg')['sy_pnum'].mean()

dataframes = [mean_exoplanets_by_age, mean_exoplanets_by_mass, mean_exoplanets_by_density,
              mean_exoplanets_by_radius, mean_exoplanets_by_teff, mean_exoplanets_by_logg]

variables = ['Stellar Age', 'Stellar Mass', 'Stellar Density', 'Stellar Radius', 'Stellar Teff', 'Stellar Logg']

# Create subplots
fig, axs = plt.subplots(3, 2, figsize=(15, 15), dpi=500)

# Flatten the axs array
axs = axs.flatten()

# Loop over each DataFrame and subplot
for i, (df, variable) in enumerate(zip(dataframes, variables)):
    # Perform linear regression
    slope, intercept, r_value, p_value, std_error = stats.linregress(df.index, df.values)
    
    # Plot the data and regression line
    axs[i].scatter(df.index.values, df.values, marker='o', color='black', alpha=0.5)
    axs[i].plot(df.index.values, intercept + slope * df.index.values, color='red', linestyle='-', linewidth=2)
    
    # Set plot title and labels
    axs[i].set_title(f'{variable} vs Number of Exoplanets, p={p_value}')
    axs[i].set_xlabel(variable)
    axs[i].set_ylabel('Number of Exoplanets')
    axs[i].grid(True)

# Adjust layout
plt.tight_layout()
plt.show()
No description has been provided for this image
In [ ]:
#lets apply these 6 terms (stellar age, stellar mass, stellar density, 
#  stellar radius, stellar effective temperature, stellar surface gravity) to a 
# regression model, so we can predict the number of exoplanets in a given system

#define features and target
X = exoDf[['st_age', 'st_mass', 'st_dens', 'st_rad', 'st_teff', 'st_logg']]
y = exoDf['sy_pnum']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#model = LinearRegression()
#model = DecisionTreeRegressor()

model = RandomForestRegressor(n_estimators=100)
model2 = ExtraTreesRegressor(n_estimators=200)
model3 = LinearRegression()
model4 = ElasticNet(alpha=0.1, l1_ratio=0.5)
model5 = DummyRegressor(strategy='mean')

#model = ExtraTreesClassifier(n_estimators=100)
model.fit(X_train, y_train)
model2.fit(X_train, y_train)
model3.fit(X_train, y_train)
model4.fit(X_train, y_train)
model5.fit(X_train, y_train)



# Predict the number of exoplanets for the test set
y_pred = model.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred3 = model3.predict(X_test)
y_pred4 = model4.predict(X_test)
y_pred5 = model5.predict(X_test)

# Calculate and print metrics
mse = mean_squared_error(y_test, y_pred)
mse2 = mean_squared_error(y_test, y_pred2)
mse3 = mean_squared_error(y_test, y_pred3)
mse4 = mean_squared_error(y_test, y_pred4)
mse5 = mean_squared_error(y_test, y_pred5)

r2 = r2_score(y_test, y_pred)
r22 = r2_score(y_test, y_pred2)
r23 = r2_score(y_test, y_pred3)
r24 = r2_score(y_test, y_pred4)
r25 = r2_score(y_test, y_pred5)

print(f"Random Forest MSE: {mse}")
print(f"Random Forest R-squared: {r2}")
print(f"Extra Trees MSE: {mse2}")
print(f"Extra Trees R-squared: {r22}")
print(f"Linear Regression MSE: {mse3}")
print(f"Linear Regression R-squared: {r23}")
print(f"ElasticNet MSE: {mse4}")
print(f"ElasticNet R-squared: {r24}")
print(f"Dummy (mean) MSE: {mse5}")
print(f"Dummy (mean) R-squared: {r25}")

#combine into dataframe for plotting
plot_data = pd.DataFrame({
    'Actual': y_test,
    'Random Forest': y_pred,
    'Extra Trees': y_pred2,
    'Linear Regression': y_pred3,
    'Stochastic Gradient Descent': y_pred4,
    'Dummy (mean)': y_pred5
})

# Melt the DataFrame to long format for Seaborn
plot_data = plot_data.reset_index(drop=True)
plot_data_melted = plot_data.melt(var_name='Type', value_name='Number of Exoplanets')

# Plot the results
plt.figure(figsize=(15, 10), dpi=200)
sns.violinplot(x='Type', y='Number of Exoplanets', data=plot_data_melted, palette='Set2')
plt.xlabel('')
plt.ylabel('Number of Exoplanets')
plt.title('Actual vs Predicted Number of Exoplanets')
plt.grid(True)
plt.show()
Random Forest MSE: 0.3199367634819339
Random Forest R-squared: 0.7798217920302237
Extra Trees MSE: 0.2573534284124437
Extra Trees R-squared: 0.822891198666734
Linear Regression MSE: 1.4288107134911496
Linear Regression R-squared: 0.016702616477324184
ElasticNet MSE: 1.4352209391935493
ElasticNet R-squared: 0.012291144683724275
Dummy (mean) MSE: 2.240789656610594
Dummy (mean) R-squared: -0.5420955243162937
No description has been provided for this image
In [ ]:
#remove linear/elasticnet
plot_data = pd.DataFrame({
    'Actual': y_test,
    'Random Forest': y_pred,
    'Extra Trees': y_pred2,
})

# Melt the DataFrame to long format for Seaborn
plot_data = plot_data.reset_index(drop=True)
plot_data_melted = plot_data.melt(var_name='Type', value_name='Number of Exoplanets')

#print MSEand r^2 values
print(f"Random Forest MSE: {mse}")
print(f"Random Forest R-squared: {r2}")
print(f"Extra Trees MSE: {mse2}")
print(f"Extra Trees R-squared: {r22}")

# Plot the results
plt.figure(figsize=(15, 10), dpi=200)
sns.violinplot(x='Type', y='Number of Exoplanets', data=plot_data_melted, palette='Set2')
plt.xlabel('')
plt.ylabel('Number of Exoplanets')
plt.title('Actual vs Predicted Number of Exoplanets')
plt.grid(True)
plt.show()
Random Forest MSE: 0.3199367634819339
Random Forest R-squared: 0.7798217920302237
Extra Trees MSE: 0.2573534284124437
Extra Trees R-squared: 0.822891198666734
No description has been provided for this image